Read data

seu <- readRDS("/t1-data/project/tsslab/zhu/multiome/analysis_newref/preprocessing/rds/rna_atac_singlet/seuobj_rna64340.rds")
dim(seu)
## [1] 27599 64340

QC

DefaultAssay(seu) <- "RNA"
VlnPlot(
  object = seu,
  features = c("nCount_RNA",  "nFeature_RNA", "percent.mt",
               "nFrags","TSSEnrichment","NucleosomeRatio"),
  ncol = 3, group.by = "sample",
  pt.size = 0
)

table(seu$sample)
## 
##  10cit   10dp  16cit   16dp  22cit 22cit2   22dp   4mix 
##  10313   6725  13927  10196   8838    133   5989   8219
plot1 <- FeatureScatter(seu, feature1 = "nCount_RNA", feature2 = "percent.mt", group.by = "sample", pt.size = 0.1)
plot2 <- FeatureScatter(seu, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",group.by = "sample", pt.size = 0.1)
plot1 + plot2

Seurat pipeline of singlet GEX data - without doublets

DefaultAssay(seu) <- "RNA"
seu <- NormalizeData(seu, normalization.method = "LogNormalize", scale.factor = 10000)
seu <- FindVariableFeatures(seu, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(seu), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(seu)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1 + plot2
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 611 rows containing missing values (geom_point).
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 611 rows containing missing values (geom_point).

all.genes <- rownames(seu)
seu <- ScaleData(seu, features = all.genes)
## Centering and scaling data matrix
seu <- RunPCA(seu, features = VariableFeatures(object = seu))
## PC_ 1 
## Positive:  loxl5b, col9a3, col9a1b, FQ323119.1, zmp:0000000760, col5a3a, znf385b, col2a1a, cmn, ntd5 
##     rcn3, col11a1a, tdgf1, col9a2, col2a1b, paqr4b, col8a1a, p4ha1b, loxl3b, CU655820.1 
##     plod1a, slc38a8b, lama5, ablim2, cnmd, caskin1, shha, emilin3a, znf385a, robo4 
## Negative:  cntfr, nrp2a, pdgfra, ednrab, mdkb, pax3a, cdh6, kif21b, rxraa, col18a1a 
##     col15a1b, alcama, CU467961.1, si:dkey-250k10.4, foxd3, kcnab1a, zic2b, itga9, crlf1a, sdk2b 
##     zgc:174888, lima1a, pmp22a, msx1b, cdh11, epha4b, celf2, dusp5, crestin, gphna 
## PC_ 2 
## Positive:  col4a5, actb2, cdon, plpp3, slit2, tram1, cdh6, ednrab, actn1, qkib 
##     her9, plod1a, p4ha1b, rrbp1b, pmp22a, bnc2, si:dkey-22o22.2, ccn1, CU467961.1, pdgfra 
##     cd151l, caskin1, ugcg, mdkb, kaznb, kdelr2b, kif21b, sec61a1l, creb5b, loxl5b 
## Negative:  txlnba, obsl1a, actc1b, nexn, desma, tnni2b.1, tpma, smyhc1, neb, chrng 
##     tmem38a, aldoab, srl, ttn.2, xirp1, acta1b, tnnc2, ryr1b, hsp90aa1.1, musk 
##     asb12b, ttn.1, trdn, BX571827.3, klhl31, chrnd, cacng1a, obscnb, ENSDARG00000112015, tnnt2c 
## PC_ 3 
## Positive:  spaca4l, krt4, cyt1, cyt1l, zgc:174938, cd9b, krt92, si:dkey-87o1.2, epcam, krt5 
##     cldne, si:ch211-195b11.3, krt17, cldn7b, agr1, cldnb, krt91, anxa1c, gpa33a, oclna 
##     si:cabz01007794.1, ppl, spint2, evplb, eppk1, gbgt1l4, zgc:153665, cfl1l, krt97, si:ch73-347e22.8 
## Negative:  epha4a, serpinh1b, foxd3, pax3a, lhfpl6, zic2b, mtss1lb, rxraa, gdf11, hspb1 
##     si:dkey-22o22.2, nav2a, gphna, qkib, itga9, kif21b, alg6, lpar1, foxd3-mCherry, kcnab1a 
##     cdon, cdh6, pdzd2, caskin1, slc2a11b, tspan36, BX663503.3, hoxc3a, ENSDARG00000024966, col15a1b 
## PC_ 4 
## Positive:  nrp2a, tns2a, ednrab, actb2, pcdh9, tspan36, f3b, pmp22a, bnc2, flna 
##     si:dkey-250k10.4, qkib, slc2a11b, alcama, phldb2a, CU138533.1, pdgfra, trpm1b, kif21b, dusp5 
##     tgfbr3, cax1, zgc:174888, crlf1a, aox5, slc2a15b, slc45a2, alx4a, bace2, crestin 
## Negative:  hoxc3a, mllt3, hoxc6b, fn1b, kif26ab, msgn1, pcdh8, large2, tbx16, her1 
##     hspb1, tbx16l, kank1a, ENSDARG00000104264, apoc1, rbm38, aopep, lpar1, draxin, adamts18 
##     samd10b, ism1, dlc, cyp26a1, epha4a, dld, tuba8l2, wnt8a.1, esrrga, gdf11 
## PC_ 5 
## Positive:  aox5, slc2a15b, cax1, slc45a2, CABZ01021592.1, trpm1b, gch2, fn1b, gpr143, si:dkey-106n21.1 
##     akr1b1.1, lrmda, pcdh8, syngr1a, tspan36, bace2, rab32a, pts, lpar1, pcdh9 
##     CU138533.1, her1, slc2a11b, cx30.3, samd10b, mocs1, slc2a15a, pax7b, tyr, tfeb 
## Negative:  ncam1a, plch1, epb41a, rfx4, negr1, ctnnd2b, crb2a, cep131, nfasca, ctnna2 
##     elavl3, brsk2b, rnf220a, chl1a, nbeaa, col18a1a, adgrv1, gadd45gb.1, epha7, dlb 
##     ebf2, dacha, ptprnb, epha4b, ek1, rassf7a, tmem108, inavaa, crb1, ptprna
# Examine and visualize PCA results a few different ways
print(seu[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  loxl5b, col9a3, col9a1b, FQ323119.1, zmp:0000000760 
## Negative:  cntfr, nrp2a, pdgfra, ednrab, mdkb 
## PC_ 2 
## Positive:  col4a5, actb2, cdon, plpp3, slit2 
## Negative:  txlnba, obsl1a, actc1b, nexn, desma 
## PC_ 3 
## Positive:  spaca4l, krt4, cyt1, cyt1l, zgc:174938 
## Negative:  epha4a, serpinh1b, foxd3, pax3a, lhfpl6 
## PC_ 4 
## Positive:  nrp2a, tns2a, ednrab, actb2, pcdh9 
## Negative:  hoxc3a, mllt3, hoxc6b, fn1b, kif26ab 
## PC_ 5 
## Positive:  aox5, slc2a15b, cax1, slc45a2, CABZ01021592.1 
## Negative:  ncam1a, plch1, epb41a, rfx4, negr1
DimPlot(seu, reduction = "pca", group.by = "sample")

ElbowPlot(seu, ndims = 50)

seu <- FindNeighbors(seu, dims = 1:25)
## Computing nearest neighbor graph
## Computing SNN
seu <- FindClusters(seu, resolution = 0.6)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 64340
## Number of edges: 2437107
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9407
## Number of communities: 30
## Elapsed time: 19 seconds
## 1 singletons identified. 29 final clusters.
# If you haven't installed UMAP, you can do so via reticulate::py_install(packages =
# 'umap-learn')
seu <- RunUMAP(seu, dims = 1:25)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 15:40:55 UMAP embedding parameters a = 0.9922 b = 1.112
## 15:40:55 Read 64340 rows and found 25 numeric columns
## 15:40:55 Using Annoy for neighbor search, n_neighbors = 30
## 15:40:55 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 15:41:09 Writing NN index file to temp file /Filers/scratch/zhu/4891762//Rtmpbudq8Q/file3d8169a6a35d
## 15:41:09 Searching Annoy index using 1 thread, search_k = 3000
## 15:41:36 Annoy recall = 100%
## 15:41:37 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 15:41:42 Initializing from normalized Laplacian + noise (using irlba)
## 15:41:53 Commencing optimization for 200 epochs, with 2915880 positive edges
## 15:43:26 Optimization finished
# saveRDS(seu, file = "../rds/seu_8batches_RNAonly_soupxFiltered_DoubletFinderFiltered_clustered_subsetArchR.rds")

Clustering results

seu$stage <- factor(seu$stage, levels = c("epiboly-4ss","6-10ss","12-16ss","18-22ss"))
p1 <- DimPlot(seu, label = TRUE, reduction = "umap",cols = alpha(col.ls, 0.3))
p2 <- DimPlot(seu, reduction = "umap", group.by = "genotype",cols = alpha(genotype_cols, 0.4), pt.size = 0.5)
p3 <- DimPlot(seu, reduction = "umap", group.by = "stage", cols = alpha(stage_cols[c(4,6,8,10)], 0.4), pt.size = 0.5)
p1 + p2 + p3

DimPlot(seu, reduction = "umap", group.by = "genotype", cols = alpha(sample_cols, 0.4), pt.size = 0.1, split.by = "stage")

DimPlot(seu, reduction = "umap", group.by = "stage", cols = alpha(stage_cols[c(4,6,8,10)], 0.4), pt.size = 0.1, split.by = "genotype")

p1 <- DimPlot(seu, reduction = "umap", group.by = "sample", cols = alpha(sample_cols, 0.4), pt.size = 0.1)
p1

Expression of known marker genes

FeaturePlot(seu, features = c("foxd3", "foxd3-mCherry","foxd3-citrine","alg6"), pt.size = 0.1, ncol = 4)

known_markers <- c("foxd3", "sox10","pax3a","tfap2a",
                   "sox3","sox19b",
                   "mafba","egr2b","hoxb3a",
                   "tbxta","sox2","msgn1")
FeaturePlot(seu, features = c(known_markers), ncol = 6)

VlnPlot(seu, features = c(known_markers), ncol = 3, pt.size = 0)

VlnPlot(seu, features = c("krt4"), pt.size = 0.1)

FeaturePlot(seu, features = c("elavl3"), pt.size = 0.1)

Find new markers

nc.markers <- FindAllMarkers(seu, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
## Calculating cluster 18
## Calculating cluster 19
## Calculating cluster 20
## Calculating cluster 21
## Calculating cluster 22
## Calculating cluster 23
## Calculating cluster 24
## Calculating cluster 25
## Calculating cluster 26
## Calculating cluster 27
## Calculating cluster 28
nc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC) -> top2
top2
## # A tibble: 58 × 7
## # Groups:   cluster [29]
##    p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene   
##    <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>  
##  1     0       3.08 0.901 0.246         0 0       pmp22a 
##  2     0       1.79 0.963 0.48          0 0       rxraa  
##  3     0       2.74 0.862 0.132         0 1       ncam1a 
##  4     0       1.97 0.669 0.144         0 1       crb2a  
##  5     0       2.50 0.982 0.533         0 2       cdon   
##  6     0       2.32 0.619 0.107         0 2       alx4a  
##  7     0       3.20 0.809 0.287         0 3       apoc1  
##  8     0       3.14 0.796 0.08          0 3       samd10b
##  9     0       2.65 0.761 0.115         0 4       esrrga 
## 10     0       2.65 0.964 0.339         0 4       kif26ab
## # … with 48 more rows
# top5 <- nc.markers %>% group_by(cluster) %>% top_n(n = 5, wt = avg_log2FC)
# DoHeatmap(nc_wt, features = (top5$gene))
# df <- data.frame(
#   clusters = 0:22,
#   names = c("NC","neural cells (ncam1a+)", "hindbrain NC (fn1b+)","NC neural","notochord", # 0to4
#             "polster (low nfeature)","epidermis","cranial cartilage (col12a1b+)","cranial cartilage (col11a1b+)",# 5to8
#             "mut-specific (lmx1bb+)","primordial germ cells","mesodermal","pigment", # 9to12
#             "ciliated (dnah2)","sensory","periderm","skeletal mucsle","vascular tubulogenesis",  # 13-17
#             "polster","erythropoiesis","low nfeature","myeloid","retinal development" # 18-22
#             )
# )
# seu$cell_types <- df$names[match(seu$seurat_clusters, df$clusters)]
# 
# DimPlot(seu, label = TRUE, group.by = "cell_types",reduction = "umap")
saveRDS(seu, "/t1-data/project/tsslab/zhu/multiome/analysis_newref/clustering/rds/seuobj_rna/seu_RNAsoupx64340_clustered.rds")
write.csv(nc.markers, "/t1-data/project/tsslab/zhu/multiome/analysis_newref/clustering/results/seu_RNAsoupx64340_markers_20221226.csv")

Technical

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS:   /Filers/package/R-base/4.2.0/lib64/R/lib/libRblas.so
## LAPACK: /Filers/package/R-base/4.2.0/lib64/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] scales_1.2.1            RColorBrewer_1.1-3      patchwork_1.1.2        
## [4] sp_1.5-0                SeuratObject_4.1.2.9003 Seurat_4.2.0.9001      
## [7] dplyr_1.0.10           
## 
## loaded via a namespace (and not attached):
##   [1] plyr_1.8.7             igraph_1.3.5           lazyeval_0.2.2        
##   [4] splines_4.2.0          listenv_0.8.0          scattermore_0.8       
##   [7] GenomeInfoDb_1.32.4    ggplot2_3.3.6          digest_0.6.30         
##  [10] htmltools_0.5.3        fansi_1.0.3            magrittr_2.0.3        
##  [13] tensor_1.5             cluster_2.1.4          ROCR_1.0-11           
##  [16] limma_3.52.4           globals_0.16.1         matrixStats_0.62.0    
##  [19] spatstat.sparse_3.0-0  colorspace_2.0-3       ggrepel_0.9.1         
##  [22] xfun_0.34              RCurl_1.98-1.9         jsonlite_1.8.3        
##  [25] progressr_0.11.0       spatstat.data_3.0-0    survival_3.4-0        
##  [28] zoo_1.8-11             glue_1.6.2             polyclip_1.10-4       
##  [31] gtable_0.3.1           zlibbioc_1.42.0        XVector_0.36.0        
##  [34] leiden_0.4.3           future.apply_1.9.1     BiocGenerics_0.42.0   
##  [37] abind_1.4-5            DBI_1.1.3              spatstat.random_3.0-1 
##  [40] miniUI_0.1.1.1         Rcpp_1.0.9             viridisLite_0.4.1     
##  [43] xtable_1.8-4           reticulate_1.26        spatstat.core_2.4-4   
##  [46] stats4_4.2.0           htmlwidgets_1.5.4      httr_1.4.4            
##  [49] ellipsis_0.3.2         ica_1.0-3              pkgconfig_2.0.3       
##  [52] farver_2.1.1           sass_0.4.2             uwot_0.1.14           
##  [55] deldir_1.0-6           utf8_1.2.2             tidyselect_1.2.0      
##  [58] labeling_0.4.2         rlang_1.0.6            reshape2_1.4.4        
##  [61] later_1.3.0            munsell_0.5.0          tools_4.2.0           
##  [64] cachem_1.0.6           cli_3.4.1              generics_0.1.3        
##  [67] ArchR_1.0.2            ggridges_0.5.4         evaluate_0.17         
##  [70] stringr_1.4.1          fastmap_1.1.0          yaml_2.3.6            
##  [73] goftest_1.2-3          knitr_1.40             fitdistrplus_1.1-8    
##  [76] purrr_0.3.5            RANN_2.6.1             pbapply_1.5-0         
##  [79] future_1.28.0          nlme_3.1-160           mime_0.12             
##  [82] ggrastr_1.0.1          compiler_4.2.0         beeswarm_0.4.0        
##  [85] plotly_4.10.0          png_0.1-7              spatstat.utils_3.0-1  
##  [88] tibble_3.1.8           bslib_0.4.0            stringi_1.7.8         
##  [91] highr_0.9              rgeos_0.5-9            lattice_0.20-45       
##  [94] Matrix_1.5-1           vctrs_0.5.0            pillar_1.8.1          
##  [97] lifecycle_1.0.3        spatstat.geom_3.0-3    lmtest_0.9-40         
## [100] jquerylib_0.1.4        RcppAnnoy_0.0.20       data.table_1.14.4     
## [103] cowplot_1.1.1          bitops_1.0-7           irlba_2.3.5.1         
## [106] httpuv_1.6.6           GenomicRanges_1.48.0   R6_2.5.1              
## [109] promises_1.2.0.1       KernSmooth_2.23-20     gridExtra_2.3         
## [112] vipor_0.4.5            IRanges_2.30.1         parallelly_1.32.1     
## [115] codetools_0.2-18       MASS_7.3-58.1          assertthat_0.2.1      
## [118] withr_2.5.0            sctransform_0.3.5      S4Vectors_0.34.0      
## [121] GenomeInfoDbData_1.2.8 mgcv_1.8-41            parallel_4.2.0        
## [124] grid_4.2.0             rpart_4.1.19           tidyr_1.2.1           
## [127] rmarkdown_2.17         Rtsne_0.16             shiny_1.7.3           
## [130] ggbeeswarm_0.6.0